Xia Fu --07/10/2020
This data set contains booking information for a city hotel and a resort hotel, including details such as when the booking was made, length of stay, the number of adults, children, and/or babies, and the number of available parking spaces, among other things. In this project, I built a model to predict booking cancellation for each reservation, which is a binary classification problem.
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from dateutil.parser import parse
from scipy.stats import sem
from sklearn import preprocessing
import sklearn.metrics as sklmetrics
import warnings
warnings.filterwarnings('ignore')
After reading the dataset, I wrote down the questions about this data that I am most interested in answering. In the next part, I conduct exploratory data analysis to understand them better.
# Load the raw hotel bookings data and take a first look:
# overall size, the first rows, and the full column list.
df = pd.read_csv("hotel_bookings.csv")
print(df.shape)
df.head()
# have a look of all the columns
df.columns.to_list()
I created a function that finds the number of missing values for each variable and calculates the percentage of missing values. From the missing-value table, there are 4 features that contain missing values.
def missing_values_table(df):
    """Summarize missing data per column: absolute count and percentage.

    Only columns with at least one missing value appear in the result,
    sorted by percentage missing (descending) and rounded to 1 decimal.
    Also prints a one-line summary of how many columns are affected.
    """
    # Absolute and relative missing counts per column.
    counts = df.isnull().sum()
    percents = 100 * counts / len(df)
    summary = pd.concat([counts, percents], axis=1).rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})
    # Keep only affected columns, worst first, rounded for display.
    summary = summary[summary.iloc[:, 1] != 0]
    summary = summary.sort_values('% of Total Values', ascending=False).round(1)
    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
           "There are " + str(summary.shape[0]) +
           " columns that have missing values.")
    return summary
# Inspect missing values, then clean the frame:
#  - "company" is almost entirely missing, so drop the column
#  - flag missing "agent" values with the sentinel -1
#  - drop the few remaining rows that still contain any NaN
missing_values_table(df)
df2 = df.drop(columns=["company"])
df2["agent"] = df2["agent"].fillna(-1)
missing_values_table(df2)
df2 = df2.dropna()
print(df2.shape)
df2.head()
missing_values_table(df2)
I conducted some visualizations in order to get better understanding of this dataset.
# Number of bookings for each of the two hotels.
ax_hotels = sns.countplot(x="hotel", data=df, color="skyblue")
ax_hotels.set_title('Frequency of Hotel')
ax_hotels.set_ylabel('Frequency')
The number of bookings for each month: there is an increase in booking number during spring and fall.
# Bookings per arrival month for 2016 (the only complete calendar year in
# the data), with months ordered January..December rather than by count.
# (The dead assignment `df_y = df` was removed: it was never read before
# being redefined later as the model target.)
plt.figure(figsize=(10,5), dpi= 100)
month = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
sns.countplot(x="arrival_date_month", data = df.loc[df.arrival_date_year == 2016 ,: ],order = month, palette=("Blues_d"))
plt.title('Month'); plt.ylabel('Frequency');
The number of bookings through each distribution channel: TA/TO (travel agent / tour operator) is the most common distribution channel.
# Number of bookings per distribution channel; TA/TO dominates.
sns.countplot(x="distribution_channel", data = df,color="skyblue")
# Title fixed: it previously read 'Frequency of Hotel', copy-pasted from
# the hotel plot above.
plt.title('Frequency of Distribution Channel'); plt.ylabel('Frequency');
The number of repeated guest(1), and non-repeated guest(0) for each hotel
# take out only the repeated guest records
# NOTE: df_repeated is not used in this plot; it feeds the percentage
# plot further below.
df_repeated = df.loc[df["is_repeated_guest"]==1, :]
# Per-hotel booking counts, split by repeated (1) vs first-time (0) guests.
ax = sns.countplot(x="hotel", hue = "is_repeated_guest", data = df,color="skyblue")
plt.title('Number of guest'); plt.ylabel('Frequency');
def without_hue(plot, feature):
    """Annotate each bar of `plot` with its height as a percentage of len(feature).

    Args:
        plot: matplotlib/seaborn Axes holding the bar patches.
        feature: sequence whose length is the percentage denominator
            (pass the full column so bars show a share of ALL records,
            not only the subset that was plotted).
    """
    total = len(feature)
    # Fixed: the loop previously read the global `ax` instead of the
    # `plot` argument, so it only worked when the caller happened to have
    # a global named `ax` pointing at the same axes.
    for p in plot.patches:
        percentage = '{:.1f}%'.format(100 * p.get_height()/total)
        x = p.get_x() + p.get_width() / 2 - 0.05
        y = p.get_y() + p.get_height()
        plot.annotate(percentage, (x, y), size = 12)
    plt.show()
Percentage of repeated guest for each hotel. City Hotel has more repeated guest than Resort Hotel.
# Repeated-guest counts per hotel, annotated with each bar's share of
# ALL bookings (the denominator passed to without_hue is the full
# df.hotel column, not just the repeated-guest subset).
ax = sns.countplot(x="hotel", data = df_repeated, color="skyblue")
plt.title('Percentage of repeated guest'); plt.ylabel('Frequency');
without_hue(ax, df.hotel)
Number of guests from each country. We can see that a large number of guests are from Portugal.
# Count bookings per country of origin and plot the ten largest.
counts_by_country = df.groupby('country').size().reset_index(name='counts')
top10_countries = counts_by_country.nlargest(10, "counts")
ax = sns.barplot(x="country", y="counts", data=top10_countries, color="skyblue")
plt.title('Number of guest')
plt.ylabel('Frequency')
Then, I looked at the daily booking price and draw the time-series graph. We can see that the price is higher in summer than it is in winter.
# Combine the separate year / month-name / day-of-month columns into one
# datetime "arrival_date" column, then average the daily rate (adr) per
# arrival day for the time-series plot below.
df_date = df.copy(deep=False)
month_to_num = {"January": 1, "February": 2, "March": 3, "April": 4,
                "May": 5, "June": 6, "July": 7, "August": 8,
                "September": 9, "October": 10, "November": 11, "December": 12}
df_date.arrival_date_month = df_date.arrival_date_month.map(month_to_num)
# pd.to_datetime assembles a datetime series from year/month/day columns.
date_parts = pd.DataFrame({'year': df_date['arrival_date_year'].to_list(),
                           'month': df_date['arrival_date_month'].to_list(),
                           'day': df_date['arrival_date_day_of_month'].to_list()})
df_date["arrival_date"] = pd.to_datetime(date_parts[["year", "month", "day"]])
df_mean = df_date.groupby('arrival_date').adr.mean()
# Plot
# Time series of the mean daily booking price (adr averaged per arrival date).
plt.figure(figsize=(20,10), dpi= 80)
plt.ylabel("price", fontsize=16)
plt.xlabel("date", fontsize=16)
# Dates rendered as 'YYYY-MM-DD' strings, giving a categorical x-axis.
x = [d.date().strftime('%Y-%m-%d') for d in df_mean.index]
plt.plot(x, df_mean, color="red", lw=2)
# x-limits are positional indices into the daily categories (~800 days);
# y-limits are in price units.
plt.axis([0, 800, 0, 200])
# Label every 30th day, truncated to 'YYYY-MM' (the [-10:-3] slice drops '-DD').
xtick_labels = [d.date().strftime('%Y-%m-%d')[-10:-3] for d in df_mean.index.tolist()[::30]]
df_sam = df_mean.reset_index()
xtick_location = df_sam.index.tolist()[::30]
plt.xticks(ticks=xtick_location, labels=xtick_labels, rotation=0, fontsize=10, horizontalalignment='center', alpha=.7)
# Decorations
# Lighten borders
plt.gca().spines["top"].set_alpha(0)
plt.gca().spines["bottom"].set_alpha(1)
plt.gca().spines["right"].set_alpha(0)
plt.gca().spines["left"].set_alpha(1)
plt.title("Daily Booking Price", fontsize=20)
# Axis limits
s, e = plt.gca().get_xlim()
# Draw Horizontal Tick lines
for y in range(10, 200, 5):
    plt.hlines(y, xmin=s, xmax=e,colors='black', alpha=0.5, linestyles="--", lw=0.5)
plt.show()
A comparison of the percentage of reservations that were canceled by guests at each hotel. The city hotel has a higher percentage of cancellations.
# select the canceled records
df_cancel=df.loc[df.is_canceled == 1, :]
# Overall cancellation rate: canceled bookings over ALL bookings.
# Fixed denominator: it was len(df2) (the cleaned frame with rows
# dropped) while the numerator came from the raw df, slightly
# overstating the rate.
print("percentage of cancelation", len(df_cancel)/len(df))
ax = sns.countplot(x="hotel", data = df_cancel, color="green")
plt.title('Canceled reservation'); plt.ylabel('Frequency');
without_hue(ax, df.hotel)
Linear relationship between the percentage of canceled reservation and lead time.
# lead time
# Group by lead time; describe() of reservation_status gives, per lead
# time, the modal status ("top") and its count ("freq"). Keep only the
# lead times where the modal status is "Canceled".
lead_cancel_data1 = df.groupby("lead_time")["reservation_status"].describe()
# .copy() so the new column below goes into an independent frame rather
# than a slice view (avoids pandas SettingWithCopyWarning).
lead_cancel_data2 = lead_cancel_data1.loc[lead_cancel_data1.top == "Canceled", : ].copy()
# create a new column called percentage, which is the rate of canceled booking over all the bookings with certain lead time point
lead_cancel_data2["percentage"] = (lead_cancel_data2["freq"]/lead_cancel_data2["count"] )* 100
plt.figure(figsize=(15,10))
sns.regplot(x=lead_cancel_data2.index.values, y = lead_cancel_data2.percentage.values)
plt.title("correlation between lead time and cancelation", fontsize = 20)
plt.xlabel("lead time", fontsize = 15)
plt.ylabel("cancelation percentage", fontsize = 15)
plt.ylim(20,110)
plt.show()
No obvious relationship between the percentage of cancellations and time.
# Daily cancellation percentage: canceled bookings over total bookings
# for each arrival date, as a one-column frame 'perc_cancel'.
grouped_cancel = df_date.groupby('arrival_date')['is_canceled']
df_total = grouped_cancel.count().to_frame()
df_cancel_count = (grouped_cancel.sum() / df_total["is_canceled"] * 100).to_frame(name='perc_cancel')
# Plot
# Time series of the daily cancellation percentage.
plt.figure(figsize=(20,10), dpi= 80)
plt.ylabel("Percentage of cancelation", fontsize=16)
plt.xlabel("date", fontsize=16)
# Dates rendered as 'YYYY-MM-DD' strings, giving a categorical x-axis.
x = [d.date().strftime('%Y-%m-%d') for d in df_cancel_count.index]
plt.plot(x, df_cancel_count, color="blue", lw=2)
# x-limits are positional indices into the daily categories; y in percent.
plt.axis([0, 800, 0, 100])
# Label every 30th day, truncated to 'YYYY-MM' (the [-10:-3] slice drops '-DD').
xtick_labels = [d.date().strftime('%Y-%m-%d')[-10:-3] for d in df_cancel_count.index.tolist()[::30]]
df_sam = df_cancel_count.reset_index()
xtick_location = df_sam.index.tolist()[::30]
plt.xticks(ticks=xtick_location, labels=xtick_labels, rotation=0, fontsize=10, horizontalalignment='center', alpha=.7)
# Decorations
# Lighten borders
plt.gca().spines["top"].set_alpha(0)
plt.gca().spines["bottom"].set_alpha(1)
plt.gca().spines["right"].set_alpha(0)
plt.gca().spines["left"].set_alpha(1)
# Axis limits
s, e = plt.gca().get_xlim()
# Draw Horizontal Tick lines
for y in range(10, 200, 5):
    plt.hlines(y, xmin=s, xmax=e,colors='black', alpha=0.5, linestyles="--", lw=0.5)
plt.show()
# Drop leakage columns: reservation_status directly encodes the target
# (Canceled / Check-Out / No-Show) and its date is only known after the
# outcome, so neither may be used as a predictor.
df3 = df2.drop(columns=["reservation_status", "reservation_status_date"])
# Map the month name to its number. Renamed from `map`, which shadowed
# the builtin of the same name.
month_map = {
    "arrival_date_month": {
        "January": 1,
        "February": 2,
        "March": 3,
        "April": 4,
        "May": 5,
        "June": 6,
        "July": 7,
        "August": 8,
        "September": 9,
        "October": 10,
        "November": 11,
        "December": 12,
    }
}
df3 = df3.replace(month_map)
print(df3.shape)
df3.head()
# Integer-encode the remaining nominal string columns. A single
# LabelEncoder instance is reused; fit_transform refits it per column.
le = preprocessing.LabelEncoder()
for cat_col in ["meal", "country", "market_segment",
                "distribution_channel", "reserved_room_type",
                "assigned_room_type"]:
    df3[cat_col] = le.fit_transform(df3[cat_col])
# One-hot encode the low-cardinality categoricals; drop_first removes one
# redundant level per feature. The dummy columns are appended and the
# original string columns removed. df_dum is also used later to exclude
# the 0/1 indicators from standardization.
feat_dummy=["deposit_type", "customer_type", "hotel"]
df_dum = pd.get_dummies(df3[feat_dummy], drop_first=True)
df4 = pd.concat([df3, df_dum], axis=1)
df4 = df4.drop(feat_dummy, axis = 1)
print(df4.shape)
df4.head()
Standardize only those numerical variables: features_s
# Standardize (zero mean, unit variance) every numeric feature.
# features  : all columns except the target
# features_s: features minus the one-hot dummies, which stay 0/1
columns = df4.columns
features = columns.drop('is_canceled')
features_s = columns.drop('is_canceled').drop(df_dum.columns)
from sklearn.preprocessing import StandardScaler
sc =StandardScaler()
df4[features_s] =sc.fit_transform(df4[features_s])
print(df4.shape)
df4.head()
# Split into feature matrix / target, then plot the cumulative explained
# variance of a full PCA to see how many components carry the signal.
df_x = df4[features]
df_y = df4["is_canceled"]
from sklearn.decomposition import PCA
pca = PCA().fit(df_x)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
I selected 25 features from 32 using backward feature selection with random forest classifier, cross validation and scoring by ROC_AUC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# Backward sequential feature selection: start from all features and
# greedily remove one at a time until 25 remain, scoring each candidate
# subset by ROC AUC. cv=2 keeps the (expensive) repeated forest fits
# tractable; n_jobs=-1 uses all cores.
sfs2 = SFS(RandomForestClassifier(),
           k_features=25,
           forward=False,
           floating=False,
           scoring='roc_auc',
           n_jobs = -1,
           cv=2)
# fillna(0) is defensive; df2 was already dropna'd, so df_x should be
# complete — TODO confirm no NaNs were reintroduced upstream.
sfs2= sfs2.fit(np.array(df_x.fillna(0)), df_y)
selected_feat= df_x.columns[list(sfs2.k_feature_idx_)]
selected_feat
Identify any features that has high correlation with another and drop them.
# Pearson correlation heatmap of the selected features; highly
# correlated features are then dropped by hand to reduce redundancy.
colormap = plt.cm.viridis
plt.figure(figsize=(12,12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(df4[selected_feat].corr(),linewidths=0.1,vmax=1.0,
            square=True, cmap=colormap, linecolor='white', annot=True)
# These three were judged redundant from the heatmap — presumably each
# correlates strongly with a kept feature; verify against the plot.
drop_col = ['assigned_room_type','market_segment', 'reserved_room_type']
x_new = selected_feat.drop(drop_col)
x_new
# Final design matrix / target, then a 70/30 train-test split with a
# fixed seed for reproducibility.
X = df4[x_new]
Y = df4["is_canceled"]
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X,Y,train_size=0.7, random_state = 1)
print ('Train Features: ',train_X.shape ,
       'Test Features: ',test_X.shape)
print ('Train Labels: ',train_y.shape ,
       'Test Labels: ',test_y.shape)
# Persist the cleaned modeling table.
# NOTE(review): the output filename has no ".csv" extension and the index
# is written as an extra column — confirm downstream readers expect this.
df_clean = pd.concat([X, Y], axis=1, sort=False)
df_clean.to_csv("df_clean")
Make a null model before starting other models to have an idea of the benchmark.
Set all dependent variable values as 0 (none was canceled), which means the accuracy of my other models should not be lower than 0.6299.
# Null model baseline: predict "not canceled" for everything. Accuracy
# equals the majority-class share; AUC is 0.5 by construction for a
# constant predictor.
y_null = np.zeros((len(test_y), 1))
roc_auc = roc_auc_score(test_y, y_null)
print("Accuracy" , sklmetrics.accuracy_score(test_y, y_null))
print("Area under the ROC curve : %f" % roc_auc)
# Baseline classifier: logistic regression with the liblinear solver,
# evaluated on accuracy, ROC AUC, and the confusion matrix.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
model = LogisticRegression(solver='liblinear').fit(train_X, train_y)
predict_y = model.predict(test_X)
print("Accuracy:", model.score(test_X, test_y))
roc_auc = roc_auc_score(test_y, predict_y)
print("Area under the ROC curve : %f" % roc_auc)
conf_mat = confusion_matrix(test_y, predict_y, labels=[0, 1])
print("confusion matrix")
conf_mat
# Tune the regularization strength C and penalty type of logistic
# regression with a 5-fold cross-validated grid search.
from sklearn.model_selection import GridSearchCV
lr_params = {'C': [1, 5, 10, 20],
             'penalty': ['l1', 'l2']}
grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), lr_params, cv=5)
grid_search.fit(train_X, train_y)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.5f}")
print("Accuracy:", grid_search.score(test_X, test_y))
predict_y2 = grid_search.predict(test_X)
roc_auc = roc_auc_score(test_y, predict_y2)
print("Area under the ROC curve : %f" % roc_auc)
conf_mat = confusion_matrix(test_y, predict_y2, labels=[0, 1])
print("confusion matrix")
print(conf_mat)
Logistic regression with optimized hyperparameter didn't effectively improve the accuracy.
from sklearn.tree import DecisionTreeClassifier
# Tune a decision tree with 10-fold cross-validated grid search.
# max_depth is given as plain ints: the previous 10.**np.arange(1, 4)
# produced floats (10.0, 100.0, 1000.0), which DecisionTreeClassifier's
# parameter validation rejects in modern scikit-learn.
param = {'criterion': ['gini', 'entropy'],
         'max_depth': [10, 100, 1000],
         'max_leaf_nodes': [300, 350, 400, 450, 500]}
grid_dt = GridSearchCV(DecisionTreeClassifier(), param, cv=10)
grid_dt.fit(train_X, train_y)
print("Best parameters: {}".format(grid_dt.best_params_))
print("Best cross-validation score: {:.5f}".format(grid_dt.best_score_))
print("Accuracy:", grid_dt.score(test_X, test_y))
predict_y3= grid_dt.predict(test_X)
roc_auc = roc_auc_score(test_y, predict_y3)
print("Area under the ROC curve : %f" % roc_auc)
conf_mat = confusion_matrix(test_y, predict_y3, labels =[0,1])
print("confusion matrix")
conf_mat
# Refit a single decision tree with the chosen hyperparameters and plot
# its feature importances, most important at the top of the bar chart.
dtc = DecisionTreeClassifier(criterion='gini', max_depth=1000, max_leaf_nodes=350)
dtc.fit(train_X, train_y)
importances = pd.Series(dtc.feature_importances_, index=train_X.columns)
importances.sort_values(ascending=True).plot(kind='barh', color='r', figsize=(10, 6))
plt.xlabel('importance', size=10)
plt.title('Decision Tree Importance', size=15)
plt.tick_params(labelsize=10)
from sklearn.tree import export_graphviz
# Fixed: sklearn.externals.six was removed from scikit-learn (>= 0.23);
# the stdlib io.StringIO is the drop-in replacement.
from io import StringIO
from IPython.display import Image
import pydotplus
# Render the fitted tree: export to DOT text, then draw it with pydotplus.
dot_data = StringIO()
export_graphviz(dtc, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names = x_new,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())